Import important libraries
rawdf = read.csv("loans.csv")
df = rawdf
df = df[c(-1)]
str(df)
## 'data.frame': 9578 obs. of 14 variables:
## $ credit.policy : chr "1" "1" "1" "1" ...
## $ purpose : chr "debt_consolidation" "credit_card" "debt_consolidation" "debt_consolidation" ...
## $ int.rate : num 0.119 0.107 0.136 0.101 0.143 ...
## $ installment : num 829 228 367 162 103 ...
## $ log.annual.inc : num 11.4 11.1 10.4 11.4 11.3 ...
## $ dti : chr "19.48" "14.29" "11.63" "8.1" ...
## $ fico : int 737 707 682 712 667 727 667 722 682 707 ...
## $ days.with.cr.line: num 5640 2760 4710 2700 4066 ...
## $ revol.bal : int 28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
## $ revol.util : chr "52.1" "76.7" "25.6" "73.2" ...
## $ inq.last.6mths : int 0 0 1 1 0 0 0 0 1 1 ...
## $ delinq.2yrs : chr "0" "0" "0" "0" ...
## $ pub.rec : chr "0" "0" "0" "0" ...
## $ not.fully.paid : int 0 0 0 0 0 0 1 1 0 0 ...
head(df)
## credit.policy purpose int.rate installment log.annual.inc dti
## 1 1 debt_consolidation 0.1189 829.10 11.35041 19.48
## 2 1 credit_card 0.1071 228.22 11.08214 14.29
## 3 1 debt_consolidation 0.1357 366.86 10.37349 11.63
## 4 1 debt_consolidation 0.1008 162.34 11.35041 8.1
## 5 1 credit_card 0.1426 102.92 11.29973 14.97
## 6 1 credit_card 0.0788 125.13 11.90497 16.98
## fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs
## 1 737 5639.958 28854 52.1 0 0
## 2 707 2760.000 33623 76.7 0 0
## 3 682 4710.000 3511 25.6 1 0
## 4 712 2699.958 33667 73.2 1 0
## 5 667 4066.000 4740 39.5 0 1
## 6 727 6120.042 50807 51 0 0
## pub.rec not.fully.paid
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
Change char type to num
cols_int = c(1,12,13)
df[cols_int] = sapply(df[cols_int],as.integer)
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
cols_num = c(6,10)
df[cols_num] = sapply(df[cols_num],as.numeric)
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
change purpose to factor type
df$purpose <-factor(df$purpose)
str(df)
## 'data.frame': 9578 obs. of 14 variables:
## $ credit.policy : int 1 1 1 1 1 1 1 1 1 1 ...
## $ purpose : Factor w/ 7 levels "all_other","credit_card",..: 3 2 3 3 2 2 3 1 5 3 ...
## $ int.rate : num 0.119 0.107 0.136 0.101 0.143 ...
## $ installment : num 829 228 367 162 103 ...
## $ log.annual.inc : num 11.4 11.1 10.4 11.4 11.3 ...
## $ dti : num 19.5 14.3 11.6 8.1 15 ...
## $ fico : int 737 707 682 712 667 727 667 722 682 707 ...
## $ days.with.cr.line: num 5640 2760 4710 2700 4066 ...
## $ revol.bal : int 28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
## $ revol.util : num 52.1 76.7 25.6 73.2 39.5 51 76.8 68.6 51.1 23 ...
## $ inq.last.6mths : int 0 0 1 1 0 0 0 0 1 1 ...
## $ delinq.2yrs : int 0 0 0 0 1 0 0 0 0 0 ...
## $ pub.rec : int 0 0 0 0 0 0 1 0 0 0 ...
## $ not.fully.paid : int 0 0 0 0 0 0 1 1 0 0 ...
sum(is.na(df))
## [1] 191
(sum(is.na(df))/nrow(df)) * 100
## [1] 1.994153
number of missing data and percentage
We can delete all the rows which has the missing value because it’s only 1.9% pf the dataset
df = na.omit(df)
nrow(df)
## [1] 9508
sum(is.na(df))
## [1] 0
Delete the row with missing values and we still have 9508 rows
summary(df)
## credit.policy purpose int.rate installment
## Min. :0.0000 all_other :2288 Min. : 0.0600 Min. : 15.67
## 1st Qu.:1.0000 credit_card :1259 1st Qu.: 0.1039 1st Qu.:164.02
## Median :1.0000 debt_consolidation:3944 Median : 0.1221 Median :269.55
## Mean :0.8076 educational : 340 Mean : 0.1256 Mean :320.15
## 3rd Qu.:1.0000 home_improvement : 627 3rd Qu.: 0.1407 3rd Qu.:435.23
## Max. :1.0000 major_purchase : 432 Max. :14.7000 Max. :940.14
## small_business : 618
## log.annual.inc dti fico days.with.cr.line
## Min. : 7.548 Min. : 0.000 Min. : 612.0 Min. : 180
## 1st Qu.:10.565 1st Qu.: 7.228 1st Qu.: 682.0 1st Qu.: 2820
## Median :10.933 Median :12.700 Median : 707.0 Median : 4140
## Mean :10.934 Mean :12.630 Mean : 711.2 Mean : 4566
## 3rd Qu.:11.293 3rd Qu.:18.000 3rd Qu.: 737.0 3rd Qu.: 5730
## Max. :14.528 Max. :29.960 Max. :1812.0 Max. :17640
##
## revol.bal revol.util inq.last.6mths delinq.2yrs
## Min. : 0 Min. : 0.00 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 3273 1st Qu.: 22.70 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 8690 Median : 46.40 Median : 1.000 Median : 0.0000
## Mean : 16998 Mean : 46.94 Mean : 1.573 Mean : 0.1637
## 3rd Qu.: 18375 3rd Qu.: 71.00 3rd Qu.: 2.000 3rd Qu.: 0.0000
## Max. :1207359 Max. :670.00 Max. :33.000 Max. :13.0000
##
## pub.rec not.fully.paid
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.06216 Mean :0.1597
## 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :5.00000 Max. :1.0000
##
Find the correlation of the data to fico
cor_matrix = cor(df[-c(2)], method = "spearman")
names = rownames(cor_matrix)
abs_cor = abs(cor_matrix)
data = data.frame(X_var = names,abs_cor = abs_cor,cor = cor_matrix)
cortmp = data[order(data$abs_cor.fico)]
cortmp['abs_cor.fico']
## abs_cor.fico
## credit.policy 0.35843775
## int.rate 0.74680656
## installment 0.08555879
## log.annual.inc 0.10600806
## dti 0.21387901
## fico 1.00000000
## days.with.cr.line 0.25133508
## revol.bal 0.09472345
## revol.util 0.51904804
## inq.last.6mths 0.17666043
## delinq.2yrs 0.23756086
## pub.rec 0.14881785
## not.fully.paid 0.14706919
We could know that ‘int.rate’,‘revol.util’, and ‘credit.policy’ has correlation with ‘fico’
cor_matrix = cor(df[-c(2)], method = "spearman",)
cor_matrix = round(cor_matrix, 2)
cor_matrix
## credit.policy int.rate installment log.annual.inc dti fico
## credit.policy 1.00 -0.30 0.07 0.03 -0.09 0.36
## int.rate -0.30 1.00 0.24 0.04 0.22 -0.75
## installment 0.07 0.24 1.00 0.43 0.06 0.09
## log.annual.inc 0.03 0.04 0.43 1.00 -0.06 0.11
## dti -0.09 0.22 0.06 -0.06 1.00 -0.21
## fico 0.36 -0.75 0.09 0.11 -0.21 1.00
## days.with.cr.line 0.11 -0.13 0.20 0.40 0.07 0.25
## revol.bal -0.02 0.15 0.35 0.42 0.37 -0.09
## revol.util -0.11 0.47 0.09 0.05 0.33 -0.52
## inq.last.6mths -0.43 0.18 0.00 0.03 0.03 -0.18
## delinq.2yrs -0.06 0.17 -0.01 0.03 -0.02 -0.24
## pub.rec -0.05 0.10 -0.03 0.01 0.01 -0.15
## not.fully.paid -0.16 0.15 0.04 -0.03 0.04 -0.15
## days.with.cr.line revol.bal revol.util inq.last.6mths
## credit.policy 0.11 -0.02 -0.11 -0.43
## int.rate -0.13 0.15 0.47 0.18
## installment 0.20 0.35 0.09 0.00
## log.annual.inc 0.40 0.42 0.05 0.03
## dti 0.07 0.37 0.33 0.03
## fico 0.25 -0.09 -0.52 -0.18
## days.with.cr.line 1.00 0.32 0.00 -0.04
## revol.bal 0.32 1.00 0.52 -0.02
## revol.util 0.00 0.52 1.00 -0.01
## inq.last.6mths -0.04 -0.02 -0.01 1.00
## delinq.2yrs 0.09 -0.06 -0.03 0.02
## pub.rec 0.10 -0.03 0.07 0.06
## not.fully.paid -0.03 0.02 0.08 0.13
## delinq.2yrs pub.rec not.fully.paid
## credit.policy -0.06 -0.05 -0.16
## int.rate 0.17 0.10 0.15
## installment -0.01 -0.03 0.04
## log.annual.inc 0.03 0.01 -0.03
## dti -0.02 0.01 0.04
## fico -0.24 -0.15 -0.15
## days.with.cr.line 0.09 0.10 -0.03
## revol.bal -0.06 -0.03 0.02
## revol.util -0.03 0.07 0.08
## inq.last.6mths 0.02 0.06 0.13
## delinq.2yrs 1.00 0.00 0.01
## pub.rec 0.00 1.00 0.06
## not.fully.paid 0.01 0.06 1.00
corrplot(cor_matrix,method='number')
## Check for the outliers
bar <- ggplot(df,aes(fico))+geom_histogram(aes(fill=factor(not.fully.paid)),color='black',bins = 40,alpha=0.5)
bar+scale_fill_manual(values = c("#FF5733","#44FF33"))+theme_bw()
boxplot(fico~purpose,data=df,col='orange')
ggplot(df) +
aes(x = fico, y = int.rate) +
geom_point(shape = "circle", size = 1.5, colour = "#228B22")
We have outliers, we need to handle with which int.rate > 10 and fico > 1500
df = df[-which(df$fico > 850),]
df = df[-which(df$int.rate> 5),]
df = df[-which(df$revol.util > 500),]
df = df[-which(df$revol.bal > 75000),]
ggplot(df) +
aes(x = fico, y = int.rate) +
geom_point(shape = "circle", size = 1.5, colour = "#228B22")
ggplot(df) +
aes(x = revol.bal, y = revol.util) +
geom_point(shape = "circle", size = 1.5, colour = "#228B22")
ggplot(df) +
aes(x = fico, y = inq.last.6mths) +
geom_point(shape = "circle", size = 1.5, colour = "#228B22")
ggplot(df) +
aes(x = fico, y = delinq.2yrs) +
geom_point(shape = "circle", size = 1.5, colour = "#228B22")
bar<- ggplot(df,aes(factor(purpose)))+geom_bar(aes(fill=factor(not.fully.paid)),position='dodge')
bar+theme(axis.text.x =element_text(angle = 90,size = 10,vjust = 0.5))+theme_bw()
box <- ggplot(df,aes(fico))+geom_histogram(aes(fill=factor(not.fully.paid)),color='black',bins = 40,alpha=0.5)
box+scale_fill_manual(values = c("#FF5733","#44FF33"))+theme_bw()
boxplot(fico~purpose,data=df,col='orange')
and now we delete the outliers of variable int.rate, revol.util and fico
Split for linear regression by using split ratio at 0.7
set.seed(99)
split = sample.split (df$fico, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)
Building Linear Model
linearMod = lm(fico ~ .,data = df_train)
summary(linearMod)
##
## Call:
## lm(formula = fico ~ ., data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94.37 -13.33 -1.35 11.99 109.71
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.075e+02 5.801e+00 139.207 < 2e-16 ***
## credit.policy 1.022e+01 8.483e-01 12.052 < 2e-16 ***
## purposecredit_card -3.065e+00 8.973e-01 -3.415 0.000642 ***
## purposedebt_consolidation -3.540e+00 6.799e-01 -5.207 1.98e-07 ***
## purposeeducational -3.203e+00 1.449e+00 -2.210 0.027124 *
## purposehome_improvement 2.021e+00 1.117e+00 1.810 0.070417 .
## purposemajor_purchase 5.146e-01 1.281e+00 0.402 0.688006
## purposesmall_business 1.354e+01 1.188e+00 11.395 < 2e-16 ***
## int.rate -8.491e+02 1.274e+01 -66.655 < 2e-16 ***
## installment 4.330e-02 1.535e-03 28.213 < 2e-16 ***
## log.annual.inc -4.301e-01 5.322e-01 -0.808 0.419058
## dti -1.747e-01 4.153e-02 -4.206 2.63e-05 ***
## days.with.cr.line 2.123e-03 1.139e-04 18.637 < 2e-16 ***
## revol.bal -2.431e-05 2.371e-05 -1.026 0.305093
## revol.util -2.989e-01 1.150e-02 -25.997 < 2e-16 ***
## inq.last.6mths 1.395e-01 1.441e-01 0.968 0.333113
## delinq.2yrs -9.043e+00 4.632e-01 -19.522 < 2e-16 ***
## pub.rec -9.746e+00 9.821e-01 -9.924 < 2e-16 ***
## not.fully.paid -3.436e+00 7.137e-01 -4.815 1.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.25 on 6425 degrees of freedom
## Multiple R-squared: 0.7158, Adjusted R-squared: 0.715
## F-statistic: 899.1 on 18 and 6425 DF, p-value: < 2.2e-16
df_test$FicoLinear = predict(linearMod,df_test)
LinearFiRM = rmse(df_test$fico, df_test$FicoLinear)
LinearFiRM
## [1] 20.6761
Now we have rmse value at 20.871 by using linear model
linearMod = lm(fico ~ int.rate * purpose+ int.rate*installment + dti*days.with.cr.line +delinq.2yrs*pub.rec*not.fully.paid,data = df_train)
summary(linearMod)
##
## Call:
## lm(formula = fico ~ int.rate * purpose + int.rate * installment +
## dti * days.with.cr.line + delinq.2yrs * pub.rec * not.fully.paid,
## data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -118.267 -13.196 -1.576 11.802 98.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.172e+02 3.083e+00 265.061 < 2e-16 ***
## int.rate -1.035e+03 2.393e+01 -43.236 < 2e-16 ***
## purposecredit_card 2.411e+00 4.447e+00 0.542 0.587799
## purposedebt_consolidation -8.452e+00 3.287e+00 -2.571 0.010155 *
## purposeeducational -2.736e+00 7.238e+00 -0.378 0.705415
## purposehome_improvement 1.009e+01 5.143e+00 1.962 0.049862 *
## purposemajor_purchase 2.678e+00 5.761e+00 0.465 0.641999
## purposesmall_business -4.860e+01 5.658e+00 -8.590 < 2e-16 ***
## installment 7.216e-02 6.877e-03 10.493 < 2e-16 ***
## dti 2.922e-02 8.044e-02 0.363 0.716403
## days.with.cr.line 3.158e-03 2.187e-04 14.439 < 2e-16 ***
## delinq.2yrs -7.781e+00 5.420e-01 -14.356 < 2e-16 ***
## pub.rec -1.138e+01 1.198e+00 -9.498 < 2e-16 ***
## not.fully.paid -4.470e+00 8.205e-01 -5.448 5.29e-08 ***
## int.rate:purposecredit_card -7.579e+01 3.676e+01 -2.062 0.039257 *
## int.rate:purposedebt_consolidation 2.082e+01 2.658e+01 0.783 0.433609
## int.rate:purposeeducational 4.973e+00 6.001e+01 0.083 0.933963
## int.rate:purposehome_improvement -6.012e+01 4.235e+01 -1.419 0.155819
## int.rate:purposemajor_purchase -6.813e+00 4.925e+01 -0.138 0.889968
## int.rate:purposesmall_business 4.783e+02 4.142e+01 11.546 < 2e-16 ***
## int.rate:installment -1.730e-01 5.145e-02 -3.362 0.000778 ***
## dti:days.with.cr.line -9.891e-05 1.559e-05 -6.344 2.39e-10 ***
## delinq.2yrs:pub.rec 6.188e+00 1.559e+00 3.969 7.28e-05 ***
## delinq.2yrs:not.fully.paid -1.494e+00 1.493e+00 -1.001 0.317014
## pub.rec:not.fully.paid 2.317e+00 2.563e+00 0.904 0.366056
## delinq.2yrs:pub.rec:not.fully.paid -3.100e+00 4.505e+00 -0.688 0.491385
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.33 on 6418 degrees of freedom
## Multiple R-squared: 0.685, Adjusted R-squared: 0.6838
## F-statistic: 558.3 on 25 and 6418 DF, p-value: < 2.2e-16
df_test$FicoLinear = predict(linearMod,df_test)
LinearFiRM = rmse(df_test$fico, df_test$FicoLinear)
LinearFiRM
## [1] 22.19836
split at the rate of 0.7
set.seed(99)
split = sample.split (df$not.fully.paid, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)
For Baseline Accuracy
t = table(df_test$not.fully.paid)
t
##
## 0 1
## 2329 432
accuracy = t[1]/sum(t)
cat("The accuracy is", round(accuracy,2))
## The accuracy is 0.84
Baseline accuracy is at 84%
LogRegModel = glm(not.fully.paid ~., family = binomial, df_train)
summary(LogRegModel)
##
## Call:
## glm(formula = not.fully.paid ~ ., family = binomial, data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6005 -0.6169 -0.4956 -0.3656 2.5350
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.401e+00 1.610e+00 5.218 1.81e-07 ***
## credit.policy -3.753e-01 1.041e-01 -3.606 0.000311 ***
## purposecredit_card -6.647e-01 1.387e-01 -4.792 1.65e-06 ***
## purposedebt_consolidation -3.892e-01 9.405e-02 -4.138 3.50e-05 ***
## purposeeducational 4.720e-02 1.857e-01 0.254 0.799368
## purposehome_improvement 5.299e-02 1.557e-01 0.340 0.733632
## purposemajor_purchase -4.685e-01 2.061e-01 -2.274 0.022991 *
## purposesmall_business 4.845e-01 1.437e-01 3.372 0.000747 ***
## int.rate 6.223e-01 2.129e+00 0.292 0.770074
## installment 1.420e-03 2.193e-04 6.476 9.42e-11 ***
## log.annual.inc -3.669e-01 7.705e-02 -4.762 1.92e-06 ***
## dti -2.530e-03 5.738e-03 -0.441 0.659310
## fico -9.208e-03 1.741e-03 -5.290 1.22e-07 ***
## days.with.cr.line 1.846e-05 1.640e-05 1.126 0.260087
## revol.bal 1.470e-06 3.338e-06 0.440 0.659640
## revol.util 3.810e-03 1.626e-03 2.343 0.019117 *
## inq.last.6mths 6.623e-02 1.611e-02 4.111 3.94e-05 ***
## delinq.2yrs -8.960e-02 6.690e-02 -1.339 0.180500
## pub.rec 8.764e-02 1.244e-01 0.705 0.481022
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5592.1 on 6441 degrees of freedom
## Residual deviance: 5238.3 on 6423 degrees of freedom
## AIC: 5276.3
##
## Number of Fisher Scoring iterations: 5
df_test$PredictedRisk = predict(LogRegModel, type = "response", df_test)
LogRegPredict = predict(LogRegModel, type = "response", df_test)
plot(df_test$PredictedRisk)
df_test$PredictedRisk_Cat = ifelse(df_test$PredictedRisk > 0.25,1,0)
t = table(df_test$not.fully.paid, df_test$PredictedRisk_Cat)
Accuracy Test
t
##
## 0 1
## 0 2090 239
## 1 309 123
accuracy = sum(diag(t)/sum(t))
cat("The accuracy is", round(accuracy,2))
## The accuracy is 0.8
ROCRpred = prediction (LogRegPredict, df_test$not.fully.paid)
ROCRperf = performance (ROCRpred, "tpr", "fpr")
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.01), text.adj = c(-0.2, 1.7))
abline(v=0.25)
AUCVal6 = as.numeric (performance (ROCRpred, "auc") @y.values)
cat("at threshold = 0.25 AUC Value is :", AUCVal6)
## at threshold = 0.25 AUC Value is : 0.6817671
AUC Value around 0.68 at threshold = 0.25
What is the best threshold value to maximize true positive rate while keeping false positive at max 25% (or 0.25)?
LogRegModel = glm(not.fully.paid ~ ., family = binomial, df_train)
summary(LogRegModel)
##
## Call:
## glm(formula = not.fully.paid ~ ., family = binomial, data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6005 -0.6169 -0.4956 -0.3656 2.5350
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.401e+00 1.610e+00 5.218 1.81e-07 ***
## credit.policy -3.753e-01 1.041e-01 -3.606 0.000311 ***
## purposecredit_card -6.647e-01 1.387e-01 -4.792 1.65e-06 ***
## purposedebt_consolidation -3.892e-01 9.405e-02 -4.138 3.50e-05 ***
## purposeeducational 4.720e-02 1.857e-01 0.254 0.799368
## purposehome_improvement 5.299e-02 1.557e-01 0.340 0.733632
## purposemajor_purchase -4.685e-01 2.061e-01 -2.274 0.022991 *
## purposesmall_business 4.845e-01 1.437e-01 3.372 0.000747 ***
## int.rate 6.223e-01 2.129e+00 0.292 0.770074
## installment 1.420e-03 2.193e-04 6.476 9.42e-11 ***
## log.annual.inc -3.669e-01 7.705e-02 -4.762 1.92e-06 ***
## dti -2.530e-03 5.738e-03 -0.441 0.659310
## fico -9.208e-03 1.741e-03 -5.290 1.22e-07 ***
## days.with.cr.line 1.846e-05 1.640e-05 1.126 0.260087
## revol.bal 1.470e-06 3.338e-06 0.440 0.659640
## revol.util 3.810e-03 1.626e-03 2.343 0.019117 *
## inq.last.6mths 6.623e-02 1.611e-02 4.111 3.94e-05 ***
## delinq.2yrs -8.960e-02 6.690e-02 -1.339 0.180500
## pub.rec 8.764e-02 1.244e-01 0.705 0.481022
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5592.1 on 6441 degrees of freedom
## Residual deviance: 5238.3 on 6423 degrees of freedom
## AIC: 5276.3
##
## Number of Fisher Scoring iterations: 5
df_test$PredictedRisk = predict(LogRegModel, type = "response", df_test)
LogRegPredict = predict(LogRegModel, type = "response", df_test)
plot(df_test$PredictedRisk)
df_test$PredictedRisk_Cat = ifelse(df_test$PredictedRisk > 0.185,1,0)
t = table(df_test$not.fully.paid, df_test$PredictedRisk_Cat)
t
##
## 0 1
## 0 1735 594
## 1 218 214
accuracy = sum(diag(t)/sum(t))
cat("The accuracy is", round(accuracy,2))
## The accuracy is 0.71
Best threshold if we want to keep false potive at 0.25 is 0.185
ROCRpred = prediction (LogRegPredict, df_test$not.fully.paid)
ROCRperf = performance (ROCRpred, "tpr", "fpr")
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.1), text.adj = c(-0.2, 1.7))
as.numeric (performance (ROCRpred, "auc") @y.values)
## [1] 0.6817671
Using only int.rate to create Regression model
LogRegModel = glm(not.fully.paid ~ int.rate , family = binomial, df_train)
summary(LogRegModel)
##
## Call:
## glm(formula = not.fully.paid ~ int.rate, family = binomial, data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0532 -0.6209 -0.5356 -0.4298 2.3052
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.722 0.173 -21.52 <2e-16 ***
## int.rate 16.135 1.303 12.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5592.1 on 6441 degrees of freedom
## Residual deviance: 5434.2 on 6440 degrees of freedom
## AIC: 5438.2
##
## Number of Fisher Scoring iterations: 4
df_test$PredictedRisk = predict(LogRegModel, type = "response", df_test)
LogRegPredict = predict(LogRegModel, type = "response", df_test)
plot(df_test$PredictedRisk)
df_test$PredictedRisk_Cat = ifelse(df_test$PredictedRisk > 0.20,1,0)
t = table(df_test$not.fully.paid, df_test$PredictedRisk_Cat)
Accuracy of my simple model
t
##
## 0 1
## 0 1907 422
## 1 305 127
accuracy = sum(diag(t)/sum(t))
cat("The accuracy is", round(accuracy,2))
## The accuracy is 0.74
ROCRpred = prediction (LogRegPredict, df_test$not.fully.paid)
ROCRperf = performance (ROCRpred, "tpr", "fpr")
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.05), text.adj = c(-0.2, 1.7))
AUCVal = as.numeric (performance (ROCRpred, "auc") @y.values)
cat("The AUC Value of my model is", round(AUCVal,3))
## The AUC Value of my model is 0.622
investment = df_test
investment$profit = (1 + investment$int.rate) ^ 3 - 1
investment$profit[investment$PredictedRisk_Cat == 1] = -1
summary(investment$profit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.0000 0.2510 0.3728 0.1073 0.4250 0.4984
sum(investment$profit)
## [1] 296.1645
head(investment)
## credit.policy purpose int.rate installment log.annual.inc dti
## 4 1 debt_consolidation 0.1008 162.34 11.35041 8.10
## 6 1 credit_card 0.0788 125.13 11.90497 16.98
## 25 1 debt_consolidation 0.1229 320.19 11.26446 8.80
## 27 1 all_other 0.0743 155.38 11.08214 0.28
## 31 1 debt_consolidation 0.0807 156.84 11.51293 2.30
## 41 1 home_improvement 0.0807 156.84 12.10071 5.55
## fico days.with.cr.line revol.bal revol.util inq.last.6mths delinq.2yrs
## 4 712 2699.958 33667 73.2 1 0
## 6 727 6120.042 50807 51.0 0 0
## 25 672 3760.958 4822 58.1 0 0
## 27 802 4649.958 1576 5.7 1 0
## 31 742 3148.958 9698 19.4 0 0
## 41 742 4019.000 40934 26.3 0 0
## pub.rec not.fully.paid PredictedRisk PredictedRisk_Cat profit
## 4 0 0 0.10955325 0 0.3339061
## 6 0 0 0.07941841 0 0.2555176
## 25 1 0 0.14947352 0 0.4158696
## 27 0 0 0.07426959 0 0.2398716
## 31 0 0 0.08168879 0 0.2621630
## 41 0 0 0.08168879 0 0.2621630
What is the average profit of a $1 investment in one of these high-interest loans? What proportion of the high-interest loans were not paid back in full?
HighInterest = df_test
HighInterest = HighInterest[which(HighInterest$int.rate >0.15),]
HighInterest$profit = (1 + HighInterest$int.rate) ^ 3 - 1
HighInterest$profit[HighInterest$PredictedRisk_Cat ==1] = -1
summary(HighInterest$profit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1 -1 -1 -1 -1 -1
HighIntProfit = sum(HighInterest$profit)
proportion = mean(HighInterest$PredictedRisk_Cat)
proportion
## [1] 1
10.1 Every 1 dollar you invest you will lose 8.88 cent ( or 0.08 dollars )by use the mean of profit variable 10.2 Proportion of the high-interest loans were not paid back in full is 0.69 or 69%
What is the profit to an investor who invested $1 in each of these 100 loans? How does this compare to investing in all loans? How does this compare to investing in all loans?
SelectedLoans = sqldf("select *
from HighInterest
order by PredictedRisk")
SelectedLoans = head(SelectedLoans,100)
SelectedLProfit = sum(SelectedLoans$profit)
SelectedLProfit - HighIntProfit
## [1] 308
11.1 Profit of the investor who invested 1 dollars in each of the loans will receive profit around 52 dollars 11.2 Compare to investing in all loans will receive more around 89 dollars
set.seed(99)
split = sample.split (df$fico, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)
Decision Tree with Regression predict fico
prpDCTR = rpart(fico ~ ., df_train,control = rpart.control(cp= 0.004))
PredictFico = predict(prpDCTR, df_test, method = "anova")
plot(df_test$fico, PredictFico)
DeciTrRM=sqrt(mean((df_test$fico - PredictFico)^2))
rpart.plot(prpDCTR)
rpart.rules(prpDCTR, cover=TRUE)
## fico cover
## 660 when int.rate >= 0.132 & purpose is all_other or credit_card or debt_consolidation or educational or home_improvement or major_purchase & credit.policy is 0 & inq.last.6mths < 4 5%
## 676 when int.rate >= 0.132 & purpose is all_other or credit_card or debt_consolidation or educational or home_improvement or major_purchase & credit.policy is 0 & inq.last.6mths >= 4 6%
## 681 when int.rate >= 0.132 & purpose is all_other or credit_card or debt_consolidation or educational or home_improvement or major_purchase & credit.policy is 1 & inq.last.6mths < 4 20%
## 693 when int.rate is 0.122 to 0.132 & installment < 401 11%
## 696 when int.rate >= 0.132 & revol.util >= 14 & purpose is small_business 3%
## 707 when int.rate is 0.096 to 0.122 & revol.util >= 25 & installment < 414 16%
## 713 when int.rate is 0.122 to 0.132 & installment >= 401 4%
## 721 when int.rate is 0.096 to 0.122 & revol.util < 25 & installment < 414 6%
## 734 when int.rate is 0.096 to 0.122 & revol.util >= 11 & installment >= 414 7%
## 738 when int.rate >= 0.132 & revol.util < 14 & purpose is small_business 1%
## 747 when int.rate < 0.096 & revol.util >= 17 12%
## 753 when int.rate < 0.096 & revol.util < 17 & days.with.cr.line < 3810 3%
## 754 when int.rate >= 0.132 & purpose is all_other or credit_card or debt_consolidation or educational or home_improvement or major_purchase & credit.policy is 1 & inq.last.6mths >= 4 0%
## 765 when int.rate is 0.096 to 0.122 & revol.util < 11 & installment >= 414 2%
## 778 when int.rate < 0.096 & revol.util < 17 & days.with.cr.line >= 3810 6%
plotcp(prpDCTR)
DeciTrRM
## [1] 22.63712
LinearFiRM
## [1] 22.19836
cat("Different of RMSE is Decision Tree Model RMSE - Linear Model RMSE: ",DeciTrRM - LinearFiRM)
## Different of RMSE is Decision Tree Model RMSE - Linear Model RMSE: 0.4387531
RMSE of Decision Tree is at 22.63 for Linear Regression is 20.67 It means that Linear Regression slightly better because RMSE is lower than 1.96
Decision Tree predict not.fully.paid
set.seed(99)
split = sample.split (df$not.fully.paid, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)
nfpDCTM = rpart(not.fully.paid~.,data=df_train, method = 'class',cp=0.002)
prp(nfpDCTM)
rpart.plot(nfpDCTM)
printcp(nfpDCTM)
##
## Classification tree:
## rpart(formula = not.fully.paid ~ ., data = df_train, method = "class",
## cp = 0.002)
##
## Variables actually used in tree construction:
## [1] credit.policy days.with.cr.line delinq.2yrs dti
## [5] inq.last.6mths installment int.rate log.annual.inc
## [9] purpose revol.util
##
## Root node error: 1009/6442 = 0.15663
##
## n= 6442
##
## CP nsplit rel error xerror xstd
## 1 0.0023125 0 1.00000 1.0000 0.028911
## 2 0.0020000 20 0.93756 1.0575 0.029571
plotcp(nfpDCTM)
{r} # nfpPred = predict(nfpDCTM, newdata=df_test,type='class') # table = table(nfpPred, df_test$not.fully.paid) # table # accuracy = sum(diag(table))/(sum(table)) # cat("The accuracy is", round(accuracy,3)) #nfpPred = predict(nfpDCTM, newdata=df_test,type='prob')
nfpPred_Cat = ifelse(nfpPred[, 2] > 0.2, 1, 0)
table = table(nfpPred_Cat, df_test$not.fully.paid)
table
##
## nfpPred_Cat 0 1
## 0 2017 297
## 1 312 135
accuracy = sum(diag(table))/(sum(table))
cat("The accuracy is", round(accuracy,2))
## The accuracy is 0.78
ROCRpred = prediction (nfpPred[, 2], df_test$not.fully.paid)
AUCVal12 = as.numeric (performance (ROCRpred, "auc") @y.values) # higher auc value is better
performance (ROCRpred, "tpr", "fpr")
## A performance instance
## 'False positive rate' vs. 'True positive rate' (alpha: 'Cutoff')
## with 20 data points
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.1), text.adj = c(-0.2, 1.7))
AUC of Logistic Regression is higher than AUC of task 12 by 0.09
AUCVal6
## [1] 0.6817671
AUCVal12
## [1] 0.5849698
AUCVal6 - AUCVal12
## [1] 0.09679733
Random Forest with Regression to predict fico
set.seed(99)
split = sample.split (df$fico, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)
FicoForest = randomForest(fico~.,df_train, ntree=700, mtry=2)
PredictForest = predict(FicoForest, df_test)
RanForRM = round(sqrt(mean((df_test$fico - PredictForest)^2)),2)
cat("rmse is:", RanForRM)
## rmse is: 18.11
RanForRM
## [1] 18.11
LinearFiRM
## [1] 22.19836
cat("Different of RMSE is Random Forest Model RMSE - Linear Model RMSE: ",RanForRM - LinearFiRM)
## Different of RMSE is Random Forest Model RMSE - Linear Model RMSE: -4.088365
Random Forest with Classification to predict not.fully.paid
set.seed(99)
split = sample.split (df$not.fully.paid, SplitRatio = 0.70)
df_train = subset(df, split == TRUE)
df_test = subset(df, split == FALSE)
NFPForest = randomForest(not.fully.paid ~ ., data = df_train, mtry = 3, ntree = 50)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
PredictNFP = predict(NFPForest, df_test, type = "class")
df_test$RandomForest = predict(NFPForest, df_test, type = "class")
plot(PredictNFP)
df_test$PredictedRisk = ifelse(df_test$RandomForest > 0.25,1,0)
table(df_test$not.fully.paid, df_test$PredictedRisk)
##
## 0 1
## 0 1939 390
## 1 284 148
ROCRpred = prediction (PredictNFP, df_test$not.fully.paid)
ROCRperf = performance (ROCRpred, "tpr", "fpr")
plot (ROCRperf, colorize = TRUE, print.cutoffs.at = seq (0, 1, by = 0.1), text.adj = c(-0.2, 1.7))
AUCVal = as.numeric (performance (ROCRpred, "auc") @y.values)
cat("The AUC Value of my model is", round(AUCVal,3))
## The AUC Value of my model is 0.67
df = rawdf
df = na.omit(df)
df = df[-which(df$fico > 850 ),]
df = df[-which(df$int.rate> 5),]
df = df[-which(df$revol.util > 500),]
df = df[-which(df$revol.bal > 75000),]
df = df[-c(1)]
cols_num = c(1,12,13,6,10)
df[cols_num] = sapply(df[cols_num],as.numeric)
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
df_cluster = df
df_cluster = fastDummies::dummy_cols(df_cluster, select_columns = "purpose")
df_cluster = df_cluster[-c(2)]
Scale and build cluster
df_cluster = df_cluster[c(1,2,6,7,9)]
df_cluster = na.omit(df_cluster)
df_scale = scale(df_cluster)
km.out = kmeans(df_scale, 3 ,nstart=20)
N = 10
information = rep ( NA, N )
for ( i in 1: N ){
KM = kmeans ( df_scale, centers = i, iter.max = 35, nstart = 10 )
information [ i ] = KM$tot.withinss
}
plot ( information ~ seq ( 1:N ), type = "b", pch = 1, col = 2, ylab = "Total within Sum of Squares", lwd=2,
xlab = "Number of Clusters", main = "Selecting K by elbow method" )
plot(df_cluster, col = km.out$cluster)
cluster1 = subset(df_cluster, km.out$cluster == 1)
cluster2 = subset(df_cluster, km.out$cluster == 2)
cluster3 = subset(df_cluster, km.out$cluster == 3)